# encoding=utf-8
# https://segmentfault.com/a/1190000004434983
import jieba
import nltk
from nltk.corpus import PlaintextCorpusReader

# Demonstrate jieba's three segmentation modes on sample sentences.

full_tokens = jieba.cut("我来到北京清华大学", cut_all=True)
print("Full Mode: " + "/ ".join(full_tokens))  # full mode: every possible word

precise_tokens = jieba.cut("我来到北京清华大学", cut_all=False)
print("Default Mode: " + "/ ".join(precise_tokens))  # precise mode

default_tokens = jieba.cut("他来到了网易杭研大厦")  # precise mode is the default
print(", ".join(default_tokens))

search_tokens = jieba.cut_for_search("小明硕士毕业于中国科学院计算所，后在日本京都大学深造")  # search-engine mode
print(", ".join(search_tokens))

# Segment a whole comment file with jieba and join the tokens with '/'.
#
# BUG FIX: the original called tmp_line.encode('GBK') and fed the resulting
# bytes to jieba.cut. jieba decodes a bytes argument as UTF-8, so GBK-encoded
# Chinese bytes raise UnicodeDecodeError (or yield mojibake). Cut the str
# directly instead; jieba.cut accepts unicode text.
# NOTE(review): assumes comment2.txt is UTF-8 (the corpus files below are
# documented as UTF-8) — confirm, otherwise adjust the encoding argument.
with open('jieba_data/comment2.txt', encoding='utf-8') as f:
    tmp_line = f.read()
    print(type(tmp_line))
    assert isinstance(tmp_line, str)
    jieba_cut = jieba.cut(tmp_line)  # default precise mode on the raw text
    ans = '/'.join(jieba_cut)
    print(type(ans), '\n', ans)


# Build an NLTK corpus over two pre-segmented comment files and print
# tf / idf / tf-idf for every distinct token in the corpus.
corpus_root = './jieba_data'
allText = PlaintextCorpusReader(corpus_root, ['comment4.txt', 'comment5.txt'])  # input files are UTF-8

print(type(allText))

sinica_text = nltk.Text(allText.words())
mytexts = nltk.TextCollection(allText)

print(len(mytexts._texts))  # number of documents in the collection
print(len(mytexts))         # total token count across all documents

the_set = set(sinica_text)  # vocabulary: distinct tokens
print(len(the_set))

# PERF FIX: hoist the loop-invariant raw() call — the original evaluated
# allText.raw(['comment4.txt']) twice per vocabulary item, re-reading the
# file content on every iteration.
# NOTE(review): tf() against a raw *string* counts substring occurrences,
# not token frequency — presumably intentional for unspaced CJK text;
# confirm, or pass allText.words(['comment4.txt']) for token-level tf.
raw_comment4 = allText.raw(['comment4.txt'])
for tmp in the_set:
    print(tmp, "tf", mytexts.tf(tmp, raw_comment4), "idf", mytexts.idf(tmp), mytexts.tf_idf(tmp, raw_comment4))
